class Agent:
    """Grid-world agent: carries the action set and its current cell."""

    def __init__(self):
        # Actions are arrow glyphs: up, down, left, right.
        self.actions = ["↑", "↓", "←", "→"]
        self.pos = (0, 0)  # (x, y) grid coordinates
agent = Agent()  # the single agent that moves around the grid
size = 35  # side length of the square grid (size x size states)
class State:
    """One grid cell: per-action Q-values, a step reward, and a goal flag."""

    def __init__(self, actions):
        # Q-value table: one zero-initialised entry per action.
        self.Q = {a: 0.0 for a in actions}
        self.reward = 1
        self.best_action = "↓"  # placeholder greedy action before learning
        self.goal = False

    def set_goal(self, actions):
        """Mark this cell as the goal; zero the Q-value of each given action."""
        for a in actions:
            self.Q[a] = 0.0
        self.goal = True
states = [[State(agent.actions) for i in range(size)] for j in range(size)]  # size x size grid, indexed states[x][y]
goal_pos = 26
states[goal_pos][goal_pos].set_goal(agent.actions)  # goal cell on the diagonal at (26, 26)
obs_box_pos = 10  # corner coordinate of the square obstacle region
obs_box_size = 10  # side length of the obstacle region
def set_obstacle(pos, size):
    """Penalise every cell in the square [pos, pos+size) x [pos, pos+size).

    NOTE: the `size` parameter shadows the module-level grid `size`.
    """
    for x in range(pos, pos + size):
        row = states[x]
        for y in range(pos, pos + size):
            row[y].reward -= 3


set_obstacle(obs_box_pos, obs_box_size)
import matplotlib.pyplot as plt
import matplotlib.patches as patches
def draw(mark_pos):
    """Render the grid's Q-values as a heat map.

    mark_pos: None (values only), "all" (print the greedy action in every
    non-goal cell), or an (x, y) tuple (print the greedy action in that
    one cell). Side effect: shows and then closes a matplotlib figure.
    """
    fig, ax = plt.subplots(figsize=(20, 20))
    # Cell value = Q of the cell's currently-best action. pcolor treats the
    # outer index as the row (y), hence i runs over the inner comprehension.
    values = [[states[i][j].Q[states[i][j].best_action] for i in range(size)]
              for j in range(size)]
    ax.pcolor(values, cmap=plt.cm.YlOrRd, vmin=0, vmax=8)  # was: mp = ...; return value unused
    ax.set_aspect(0.6)
    ax.set_xticks(range(size), minor=False)
    ax.set_yticks(range(size), minor=False)
    for x in range(len(values)):
        for y in range(len(values[0])):
            s = states[x][y]
            if s.goal:
                plt.text(x + 0.5, y + 0.75, "Goal", ha='center', va='center', size=10)
            else:
                # Q-value truncated (not rounded) to 2 decimal places.
                plt.text(x + 0.5, y + 0.75, int(100 * s.Q[s.best_action]) / 100,
                         ha='center', va='center', size=8)
    plt.text(agent.pos[0] + 0.5, agent.pos[1] + 0.25, "agent",
             ha='center', va='center', size=10)
    if mark_pos == "all":  # write the action string at the requested position(s)
        for x in range(size):
            for y in range(size):
                if states[x][y].goal:
                    continue
                plt.text(x + 0.5, y + 0.25, states[x][y].best_action,
                         ha='center', va='center', size=10)
    elif mark_pos is not None:
        s = states[mark_pos[0]][mark_pos[1]]
        plt.text(mark_pos[0] + 0.5, mark_pos[1] + 0.25, s.best_action,
                 ha='center', va='center', size=10)
    plt.show()
    # Close (not just clear) the figure so repeated calls during training
    # don't accumulate open figures in pyplot's registry.
    plt.close(fig)
draw(None)  # render the initial (all-zero) value map before training
import random
def state_transition(s_pos, a):
    """Return the position reached by taking action `a` from `s_pos`.

    Transitions are stochastic: with probability 0.1 the move fails and the
    agent stays where it is. Moves off the grid are clamped to the border.
    """
    # 10% chance the action has no effect.
    if random.uniform(0, 1) < 0.1:
        return s_pos
    # Unknown actions fall through to a zero move (same as the original
    # if/elif chain, which moved nothing for an unrecognised action).
    deltas = {"↑": (0, 1), "↓": (0, -1), "→": (1, 0), "←": (-1, 0)}
    dx, dy = deltas.get(a, (0, 0))
    x = min(max(s_pos[0] + dx, 0), size - 1)
    y = min(max(s_pos[1] + dy, 0), size - 1)
    return (x, y)
def e_greedy(s):
    """Epsilon-greedy action selection with epsilon = 0.1.

    Side effect: records the greedy choice in s.best_action.
    NOTE(review): the greedy branch selects the action with the *lowest*
    Q-value (Q is treated as a cost here) — confirm this is intended.
    """
    # Explore: 10% of the time pick a uniformly random action.
    if random.uniform(0, 1) < 0.1:
        return random.choice(agent.actions)
    # Exploit: first action with the minimal Q-value (dict order breaks ties,
    # matching the original strict-comparison scan).
    chosen = min(s.Q, key=s.Q.get)
    s.best_action = chosen
    return chosen
# Plot only the agents that reach the goal within 1,000,000 steps of a trial
alpha = 0.6  # learning rate for the SARSA update
gamma = 0.9  # discount factor for future Q-values
def sarsa(s_pos, a):
    """One SARSA backup from (s, a).

    Chooses a' epsilon-greedily in the successor state and returns
    (next position, a', updated Q(s, a)); the caller writes the Q back.
    """
    s = states[s_pos[0]][s_pos[1]]
    next_pos = state_transition(s_pos, a)
    s_next = states[next_pos[0]][next_pos[1]]
    a_next = e_greedy(s_next)
    # Q(s,a) <- (1-alpha)*Q(s,a) + alpha*(r(s') + gamma*Q(s',a'))
    # (the reward used is that of the state being entered)
    updated_q = (1.0 - alpha) * s.Q[a] + alpha * (s_next.reward + gamma * s_next.Q[a_next])
    return next_pos, a_next, updated_q
def one_trial():
    """Run one episode from a random start cell until the goal is reached
    (or 1,000,000 SARSA steps elapse). Draws the board on success."""
    agent.pos = (random.randrange(size), random.randrange(size))
    start = states[agent.pos[0]][agent.pos[1]]
    a = e_greedy(start)
    if start.goal:
        return  # spawned on the goal: nothing to learn this episode
    for _ in range(1000000):
        next_pos, a_next, q = sarsa(agent.pos, a)
        states[agent.pos[0]][agent.pos[1]].Q[a] = q
        agent.pos = next_pos
        a = a_next
        if states[agent.pos[0]][agent.pos[1]].goal:
            draw("all")
            break
# Decide how many times to run one_trial
#from tqdm._tqdm_notebook import tqdm_notebook
from tqdm import tqdm
max_step = 128  # number of training episodes
for _ in tqdm(range(max_step)):
    one_trial()
draw("all")  # final policy / value map after training